/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.db;

import java.io.*;
import java.util.*;

import net.nutch.io.*;
import net.nutch.util.*;
import net.nutch.pagedb.*;
import net.nutch.linkdb.*;

/**********************************************
 * The DistributedWebDBReader implements all the read-only
 * parts of accessing our web database, when that database is
 * spread across per-machine sections.  All the writing
 * operations can be found in the corresponding writer class.
 *
 * @author Mike Cafarella
 **********************************************/
public class DistributedWebDBReader implements IWebDBReader {
    static final Page[] PAGE_RECORDS = new Page[0];
    static final Link[] LINK_RECORDS = new Link[0];

    // filenames
    static final String PAGES_BY_URL = "pagesByURL";
    static final String PAGES_BY_MD5 = "pagesByMD5";
    static final String LINKS_BY_URL = "linksByURL";
    static final String LINKS_BY_MD5 = "linksByMD5";
    static final String STATS_FILE = "stats";
    static final String META_FILE = "metainfo";

    // For different enumeration types
    static final EnumCall PAGE_ENUMS = new PageEnumCall();
    static final EnumCall PAGE_MD5_ENUMS = new PageByMD5EnumCall();
    static final EnumCall LINK_ENUMS = new LinkEnumCall();

    // Utility array for Vector conversion
    static final DBSectionReader[] STATIC_SR_ARRAY = new DBSectionReader[0];

    // Structures for multi-file db structures
    NutchFile dbDir;
    NutchFile globalWriteLock;
    DBSectionReader pagesByURL[], pagesByMD5[], linksByURL[], linksByMD5[];
    long totalPages = 0, totalLinks = 0;
    int numMachines = 0;

    /**
     * Open a web db reader for the named directory.
     */
    public DistributedWebDBReader(NutchFileSystem nutchfs, String dbName) throws IOException, FileNotFoundException {
        //
        // Get the current db from the given nutchfs.  It consists
        // of a bunch of directories full of files.
        //
        this.dbDir = new NutchFile(nutchfs, dbName, "standard", new File("webdb"));

        //
        // Wait until the webdb is complete, by waiting till a given
        // file exists.
        //
        NutchFile dirIsComplete = new NutchFile(dbDir, "dbIsComplete");
        nutchfs.get(dirIsComplete);

        //
        // Obtain non-exclusive lock on the webdb's globalWriteLock
        // so writers don't move it out from under us.
        //
        // REMIND - mjc - I think the locking here is suspect.
        /**
        this.globalWriteLock = new NutchFile(nutchfs, dbName, "standard", new File("globalWriteLock"));
        nutchfs.lock(globalWriteLock, false);
        **/

        //
        // Load in how many segments we can expect
        //
        NutchFile machineInfo = new NutchFile(nutchfs, dbName, "standard", new File("machineinfo"));
        DataInputStream in = new DataInputStream(new FileInputStream(nutchfs.get(machineInfo)));
        try {
            in.readByte();                    // version
            this.numMachines = in.readInt();  // number of db sections
        } finally {
            in.close();
        }
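        //
        // Note: as read above, machineinfo is simply a one-byte version
        // stamp followed by an int count of sections.  Each section was
        // written by one machine and covers one slice of the URL/MD5
        // key space; the DBKeyDivision calls in getPage()/getPages()
        // below route a key to the section that owns it.
        //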
        //
        // Find all the "section" subdirs.  Each section will contain
        // one of the 4 tables we're after.  Create one DBSectionReader
        // object for each table in each section.
        //
        Vector pagesByURL = new Vector(), pagesByMD5 = new Vector(), linksByMD5 = new Vector(), linksByURL = new Vector();
        for (int i = 0; i < numMachines; i++) {
            // The relevant NutchFiles for each part of this db section
            NutchFile sectionDir = new NutchFile(dbDir, "dbsection." + i);
            NutchFile pagesByURLNF = new NutchFile(sectionDir, PAGES_BY_URL);
            NutchFile pagesByMD5NF = new NutchFile(sectionDir, PAGES_BY_MD5);
            NutchFile linksByURLNF = new NutchFile(sectionDir, LINKS_BY_URL);
            NutchFile linksByMD5NF = new NutchFile(sectionDir, LINKS_BY_MD5);

            // Create DBSectionReader object for each subtype
            pagesByURL.add(new DBSectionReader(nutchfs.get(pagesByURLNF), new UTF8.Comparator()));
            pagesByMD5.add(new DBSectionReader(nutchfs.get(pagesByMD5NF), new Page.Comparator()));
            linksByURL.add(new DBSectionReader(nutchfs.get(linksByURLNF), new Link.UrlComparator()));
            linksByMD5.add(new DBSectionReader(nutchfs.get(linksByMD5NF), new Link.MD5Comparator()));

            // Load in the stats file for the section
            NutchFile sectionStats = new NutchFile(sectionDir, STATS_FILE);
            in = new DataInputStream(new FileInputStream(nutchfs.get(sectionStats)));
            try {
                in.read();                        // version
                this.totalPages += in.readLong();
                this.totalLinks += in.readLong();
            } finally {
                in.close();
            }
        }

        // Put lists into array form
        this.pagesByURL = (DBSectionReader[]) pagesByURL.toArray(STATIC_SR_ARRAY);
        this.pagesByMD5 = (DBSectionReader[]) pagesByMD5.toArray(STATIC_SR_ARRAY);
        this.linksByURL = (DBSectionReader[]) linksByURL.toArray(STATIC_SR_ARRAY);
        this.linksByMD5 = (DBSectionReader[]) linksByMD5.toArray(STATIC_SR_ARRAY);
    }

    /**
     * Shutdown
     */
    public void close() throws IOException {
        for (int i = 0; i < pagesByURL.length; i++) {
            pagesByURL[i].close();
            pagesByMD5[i].close();
            linksByURL[i].close();
            linksByMD5[i].close();
        }
    }

    /**
     * How many sections (machines) there are in this distributed db.
     */
    public int numMachines() {
        return numMachines;
    }

    /**
     * Return the number of pages we're dealing with.
     */
    public long numPages() {
        return totalPages;
    }

    /**
     * Return the number of links in our db.
     */
    public long numLinks() {
        return totalLinks;
    }

    /**
     * Get Page from the pagedb with the given URL.
     */
    public Page getPage(String url) throws IOException {
        Page result = null, target = new Page();
        UTF8 searchURL = new UTF8(url);

        // Don't do a linear search.  Instead, jump straight to the
        // section that must hold the URL, if it is present at all.
        return pagesByURL[DBKeyDivision.findURLSection(url, numMachines)].getPage(searchURL, target);
    }

    /**
     * Get all the Pages according to their content hash.
     * Since items in the pagesByMD5 DBSectionReader array will
     * be sorted by ascending blocks of the content hash,
     * we know the results will come in sorted order.
     */
    public Page[] getPages(MD5Hash md5) throws IOException {
        Vector resultSet = pagesByMD5[DBKeyDivision.findMD5Section(md5, numMachines)].getPages(md5);
        Page resultArray[] = new Page[resultSet.size()];
        int i = 0;
        for (Enumeration e = resultSet.elements(); e.hasMoreElements(); i++) {
            resultArray[i] = (Page) e.nextElement();
        }
        return resultArray;
    }

    /**
     * Test whether a certain piece of content is in the
     * database, but don't bother returning the Page(s) itself.
     * Only the single DBSectionReader whose key range covers
     * the given hash needs to be consulted.
     */
    public boolean pageExists(MD5Hash md5) throws IOException {
        return pagesByMD5[DBKeyDivision.findMD5Section(md5, numMachines)].pageExists(md5);
    }

    /**
     * Iterate through all the Pages, sorted by URL.
     * We need to enumerate all the Enumerations given
     * to us via a call to pages() for each DBSectionReader.
     */
    public Enumeration pages() throws IOException {
        return new MetaEnumerator(pagesByURL, PAGE_ENUMS);
    }
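    //
    // A minimal iteration sketch (assumes a reader opened as in
    // main() below); every element must be downcast, since these
    // are pre-generics Enumerations:
    //
    //   for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
    //       Page page = (Page) e.nextElement();
    //       // ... examine page ...
    //   }
    //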
    /**
     * Iterate through all the Pages, sorted by MD5.
     * We enumerate all the DBSectionReader Enumerations,
     * just as above.
     */
    public Enumeration pagesByMD5() throws IOException {
        return new MetaEnumerator(pagesByMD5, PAGE_MD5_ENUMS);
    }

    /**
     * Get all the hyperlinks that link TO the indicated URL.
     */
    public Link[] getLinks(UTF8 url) throws IOException {
        Vector resultSet = linksByURL[DBKeyDivision.findURLSection(url.toString(), numMachines)].getLinks(url);
        Link resultArray[] = new Link[resultSet.size()];
        int i = 0;
        for (Enumeration e = resultSet.elements(); e.hasMoreElements(); ) {
            resultArray[i++] = (Link) e.nextElement();
        }
        return resultArray;
    }

    /**
     * Grab all the links that originate from the page with
     * the given MD5 hash.
     */
    public Link[] getLinks(MD5Hash md5) throws IOException {
        Vector resultSet = linksByMD5[DBKeyDivision.findMD5Section(md5, numMachines)].getLinks(md5);
        Link resultArray[] = new Link[resultSet.size()];
        int i = 0;
        for (Enumeration e = resultSet.elements(); e.hasMoreElements(); ) {
            resultArray[i++] = (Link) e.nextElement();
        }
        return resultArray;
    }

    /**
     * Return all the links, by target URL
     */
    public Enumeration links() throws IOException {
        return new MetaEnumerator(linksByURL, LINK_ENUMS);
    }

    //
    // The EnumCall class allows the creator of MetaEnumerator
    // to indicate how to get each enumeration.  Will it be pages
    // or links?
    //
    static abstract class EnumCall {
        /**
         */
        public EnumCall() {
        }

        /**
         * Subclasses override this for different kinds of MetaEnumerator
         * behavior.
         */
        public abstract Enumeration getEnumeration(DBSectionReader reader) throws IOException;
    }

    //
    // For enumerating Pages, sorted by URL
    //
    static class PageEnumCall extends EnumCall {
        /**
         */
        public PageEnumCall() {
        }

        /**
         * Get the enum of Pages
         */
        public Enumeration getEnumeration(DBSectionReader reader) throws IOException {
            return reader.pages();
        }
    }

    //
    // For enumerating Pages, sorted by MD5
    //
    static class PageByMD5EnumCall extends EnumCall {
        /**
         */
        public PageByMD5EnumCall() {
        }

        /**
         * Get the enum of Pages, in MD5 order
         */
        public Enumeration getEnumeration(DBSectionReader reader) throws IOException {
            return reader.pagesByMD5();
        }
    }

    //
    // For enumerating Links
    //
    static class LinkEnumCall extends EnumCall {
        /**
         */
        public LinkEnumCall() {
        }

        /**
         * Get the enum of Links
         */
        public Enumeration getEnumeration(DBSectionReader reader) throws IOException {
            return reader.links();
        }
    }

    //
    // MetaEnumerator uses the Enumerations from each
    // DBSectionReader in the passed-in DBSectionReader array.
    //
    class MetaEnumerator implements Enumeration {
        Enumeration enumerations[];
        int curEnum = 0;

        /**
         * Create all the Enumerations from the given Sections
         */
        public MetaEnumerator(DBSectionReader sections[], EnumCall enumCall) throws IOException {
            this.enumerations = new Enumeration[sections.length];
            for (int i = 0; i < enumerations.length; i++) {
                enumerations[i] = enumCall.getEnumeration(sections[i]);
            }
        }

        /**
         * Go through the section Enumerations, starting at the
         * current one, until we find one that hasMoreElements.
         * Or until we hit the end.
         */
        public boolean hasMoreElements() {
            boolean result = false;

            //
            // Go through Enumerations until we find one with
            // hasMoreElements() == true.  (Or until we run out
            // of Enumerations.)
            //
            for (; curEnum < enumerations.length; curEnum++) {
                result = enumerations[curEnum].hasMoreElements();
                if (result) {
                    break;
                }
            }
            return result;
        }
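        //
        // Invariant: hasMoreElements() leaves curEnum pointing at the
        // first non-exhausted Enumeration.  Since each section covers
        // an ascending block of the key space (see getPages() above),
        // draining the sections in array order should yield the whole
        // db in sorted key order.
        //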
        /**
         * Exhaust the Objects we can receive from the
         * Enumerations array, via calls to nextElement().
         * Returns null once every Enumeration is exhausted.
         */
        public Object nextElement() {
            Object obj = null;

            //
            // Go through Enumerations until we find one with
            // a nextElement() to return.  (Or until we run out.)
            //
            for (; curEnum < enumerations.length; curEnum++) {
                if (enumerations[curEnum].hasMoreElements()) {
                    obj = enumerations[curEnum].nextElement();
                    if (obj != null) {
                        break;
                    }
                }
            }
            return obj;
        }
    }

    /**
     * The DistributedWebDBReader.main() provides some handy utility methods
     * for looking through the contents of the webdb.  Hoo-boy!
     *
     * Note this only works for a completely-NFS deployment.
     */
    public static void main(String argv[]) throws FileNotFoundException, IOException {
        if (argv.length < 2) {
            System.out.println("Usage: java net.nutch.db.DistributedWebDBReader <dbRoot> [-pageurl url] | [-pagemd5 md5] | [-dumppageurl] | [-dumppagemd5] | [-toppages <k>] | [-linkurl url] | [-linkmd5 md5] | [-dumplinks] | [-stats]");
            return;
        }

        NutchFileSystem nutchfs = new NutchNFSFileSystem(new File(argv[0]), true);
        DistributedWebDBReader reader = new DistributedWebDBReader(nutchfs, "db");
        try {
            if ("-pageurl".equals(argv[1])) {
                String url = argv[2];
                System.out.println(reader.getPage(url.trim()));
            } else if ("-pagemd5".equals(argv[1])) {
                MD5Hash md5 = new MD5Hash(argv[2]);
                Page pages[] = reader.getPages(md5);
                System.out.println("Found " + pages.length + " pages.");
                for (int i = 0; i < pages.length; i++) {
                    System.out.println("Page " + i + ": " + pages[i]);
                }
            } else if ("-dumppageurl".equals(argv[1])) {
                int i = 1;
                for (Enumeration e = reader.pages(); e.hasMoreElements(); i++) {
                    Page page = (Page) e.nextElement();
                    System.out.println("Page " + i + ": " + page);
                    System.out.println();
                }
            } else if ("-dumppagemd5".equals(argv[1])) {
                int i = 1;
                for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); i++) {
                    Page page = (Page) e.nextElement();
                    System.out.println("Page " + i + ": " + page);
                    System.out.println();
                }
            } else if ("-toppages".equals(argv[1])) {
                int topSize = Integer.parseInt(argv[2]);

                // Create a set sorted by ascending score
                SortedSet topSet = new TreeSet(new Comparator() {
                    public int compare(Object o1, Object o2) {
                        Page p1 = (Page) o1;
                        Page p2 = (Page) o2;
                        if (p1.getScore() < p2.getScore()) {
                            return -1;
                        } else if (p1.getScore() == p2.getScore()) {
                            // If two scores are equal, fall back to
                            // regular Page comparison (which uses URL
                            // as the primary key).  We don't want to
                            // uniquify by score!
                            return p1.compareTo(p2);
                        } else {
                            return 1;
                        }
                    }
                });
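                //
                // The selection loop below keeps the best topSize pages
                // seen so far.  Because topSet orders ascending by
                // score, first() is always the current worst member;
                // a new page only displaces it when the new score is
                // strictly higher.
                //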
                // Find the top "topSize" elts
                Page lowestPage = null;
                for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
                    Page curPage = (Page) e.nextElement();
                    if (topSet.size() < topSize) {
                        topSet.add(curPage);
                        lowestPage = (Page) topSet.first();
                    } else if (lowestPage.getScore() < curPage.getScore()) {
                        topSet.remove(lowestPage);
                        topSet.add(curPage);
                        lowestPage = (Page) topSet.first();
                    }
                }

                // Print them out
                int i = 0;
                for (Iterator it = topSet.iterator(); it.hasNext(); i++) {
                    System.out.println("Page " + i + ": " + (Page) it.next());
                    System.out.println();
                }
            } else if ("-linkurl".equals(argv[1])) {
                String url = argv[2];
                Link links[] = reader.getLinks(new UTF8(url.trim()));
                System.out.println("Found " + links.length + " links.");
                for (int i = 0; i < links.length; i++) {
                    System.out.println("Link " + i + ": " + links[i]);
                }
            } else if ("-linkmd5".equals(argv[1])) {
                MD5Hash fromID = new MD5Hash(argv[2]);
                Link links[] = reader.getLinks(fromID);
                System.out.println("Found " + links.length + " links.");
                for (int i = 0; i < links.length; i++) {
                    System.out.println("Link " + i + ": " + links[i]);
                }
            } else if ("-dumplinks".equals(argv[1])) {
                int i = 1;
                for (Enumeration e = reader.links(); e.hasMoreElements(); i++) {
                    Link link = (Link) e.nextElement();
                    System.out.println("Link " + i + ": " + link);
                    System.out.println();
                }
            } else if ("-stats".equals(argv[1])) {
                System.out.println("Stats for " + reader);
                System.out.println("-------------------------------");
                System.out.println("Number of pages: " + reader.numPages());
                System.out.println("Number of links: " + reader.numLinks());
                System.out.println("Number of machines (sections): " + reader.numMachines());
            } else {
                System.out.println("Sorry, no command with name " + argv[1]);
            }
        } finally {
            reader.close();
        }
    }
}